# Warnings
import warnings
warnings.filterwarnings('ignore')
# Basic
import pandas as pd
import numpy as np
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns
import ast
from collections import Counter
# Statistic
import math
from scipy.stats import chi2_contingency
# Maps
import folium as fl
from folium import Choropleth, Circle, Marker
from folium.plugins import MarkerCluster, HeatMap
#Graphs
import networkx as nx
def load_data(files):
"""
Load the data from parquet format.
The output is a list of pandas dataframes.
"""
# Dataframe list:
data = []
# Iteration for loading each file:
for i in range(0,len(files)):
# Store the dataframes
reader = pd.read_parquet(files[i])
data.append(reader)
return data
def save_dfs_to_parquet(dataframes, filenames):
"""
Save the model to parquet.
The file path must end with '.pkl'.
"""
for df, file in zip(dataframes, filenames):
df.to_parquet(file)
def words_counter(df, column, n_words):
    """
    Return the *n_words* most frequent words of a text column.

    Null entries are skipped, each entry is split on whitespace, and
    commas are removed from the resulting tokens before counting.

    Parameters
    ----------
    df : pandas.DataFrame
    column : str
        Name of the text column to analyse.
    n_words : int
        Number of top words to return.

    Returns
    -------
    pandas.DataFrame
        Columns 'word' and 'count', ordered by descending frequency.
    """
    # Drop nulls, split each entry, flatten and strip commas in one pass.
    tokens = [word.replace(',', '')
              for text in df[column].dropna()
              for word in text.split()]
    # Keep the n_words most common tokens (the old comment hard-coded "15").
    most_common_words = Counter(tokens).most_common(n_words)
    return pd.DataFrame(most_common_words, columns=['word', 'count'])
def get_deviation_of_mean_perc(df, list_var_continuous, target, multiplier):
    """
    Return, for each continuous variable, the share of values falling
    outside the interval mean +/- multiplier * std, broken down by the
    classes of *target*.

    Parameters
    ----------
    df : pandas.DataFrame
    list_var_continuous : list of str
        Continuous columns to analyse.
    target : str
        Categorical column whose class distribution is reported for the
        outlying rows.
    multiplier : float
        Half-width of the acceptance interval, in standard deviations.

    Returns
    -------
    pandas.DataFrame
        One row per variable that has outliers; empty if none do.
    """
    pd_final = pd.DataFrame()
    for i in list_var_continuous:
        series_mean = df[i].mean()
        series_std = df[i].std()
        std_amp = multiplier * series_std
        left = series_mean - std_amp
        right = series_mean + std_amp
        size_s = df[i].size
        # Fractions of values inside / outside the interval.
        # perc_goods is computed but never used further (kept as-is).
        perc_goods = df[i][(df[i] >= left) & (df[i] <= right)].size/size_s
        perc_excess = df[i][(df[i] < left) | (df[i] > right)].size/size_s
        if perc_excess>0:
            # Target-class distribution among the outlying rows.
            pd_concat_percent = pd.DataFrame(df[target][(df[i] < left) | (df[i] > right)]\
                .value_counts(normalize=True).reset_index())
            # NOTE(review): sorting by / dropping the 'index' column relies
            # on pandas < 2.0 reset_index() column naming — confirm the
            # pandas version before reusing this helper.
            pd_concat_percent.sort_values(by = 'index',inplace=True)
            pd_concat_percent = pd_concat_percent.T
            pd_concat_percent.columns = pd_concat_percent.iloc[0,:]
            pd_concat_percent = pd_concat_percent.drop('index',axis=0)
            pd_concat_percent['variable'] = i
            pd_concat_percent['sum_outlier_values'] = df[i][(df[i] < left) | (df[i] > right)].size
            pd_concat_percent['porcentaje_sum_null_values'] = perc_excess
            pd_final = pd.concat([pd_final, pd_concat_percent], axis=0).reset_index(drop=True)
    if pd_final.empty:
        print('No existen variables con valores extremos')
    return pd_final
def extract_true_values(d):
    """
    Return the keys of *d* whose value is truthy, joined by ' + '.

    Parameters
    ----------
    d : dict or None
        Mapping of attribute name -> bool (whether the attribute holds).

    Returns
    -------
    str or float
        'None' when *d* is None, NaN when no key is truthy, otherwise
        the truthy keys joined by ' + '.
    """
    if d is None:
        return 'None'
    true_values = [k for k, v in d.items() if v]
    if not true_values:
        # pd.np was removed in pandas >= 1.x; use numpy directly.
        return np.nan
    return ' + '.join(true_values)
def has_target_words(s):
    """
    Check whether a string contains any parking-related target word.

    Returns 'Yes' if at least one of 'garage', 'validated' or 'valet'
    appears (case-insensitive), 'No' if none do, and 'Unknown' when the
    input is not a string (e.g. NaN).
    """
    target_words = ['garage', 'validated', 'valet']
    # Guard clause: non-string inputs cannot be searched.
    if not isinstance(s, str):
        return 'Unknown'
    lowered = s.lower()
    return 'Yes' if any(word in lowered for word in target_words) else 'No'
def cramers_corrected_stat(x,y):
"""
Calculate Cramers V statistic for categorial-categorial association.
"""
result=-1
if len(x.value_counts())==1 :
print("First variable is constant")
elif len(y.value_counts())==1:
print("Second variable is constant")
else:
conf_matrix=pd.crosstab(x, y)
if conf_matrix.shape[0]==2:
correct=False
else:
correct=True
chi2 = chi2_contingency(conf_matrix, correction=correct)[0]
n = sum(conf_matrix.sum())
phi2 = chi2/n
r,k = conf_matrix.shape
phi2corr = max(0, phi2 - ((k-1)*(r-1))/(n-1))
rcorr = r - ((r-1)**2)/(n-1)
kcorr = k - ((k-1)**2)/(n-1)
result=np.sqrt(phi2corr / min( (kcorr-1), (rcorr-1)))
return round(result,6)
def column_to_unique(df, column):
    """
    Return a copy of *df* where the values of *column* are made unique by
    appending an increasing numeric suffix to repeated values (the second
    "McDonald's" becomes "McDonald's_1", the third "McDonald's_2", ...).

    Unlike the previous version, the input dataframe is NOT mutated (it
    used to gain a temporary 'name2' column as a side effect), and the
    column keeps its original position instead of moving to the end.
    """
    df = df.copy()
    # Track every name already emitted so collisions (including
    # pre-existing values like "A_1") are resolved deterministically.
    encountered_names = set()
    unique_names = []
    for name in df[column]:
        suffix = 0
        modified_name = name
        # Bump the suffix until the candidate has not been seen yet.
        while modified_name in encountered_names:
            suffix += 1
            modified_name = f"{name}_{suffix}"
        encountered_names.add(modified_name)
        unique_names.append(modified_name)
    df[column] = unique_names
    return df
def melt_categories(df):
    """
    Unpivot the one-column-per-category dummy layout into long format.

    Produces one (name, city, Category) row per category a restaurant
    actually has; rows whose dummy value is not the string '1' are
    discarded.
    """
    category_columns = ['American','Breakfast','Traditional','Sandwiches',
                        'Burgers','Mexican', 'Italian','Seafood','Chicken','Chinese',
                        'Meat','Japanese','Asian','Steakhouses','Fast','Mediterranean',
                        'Vegetarian','Soup','Thai','Southern','Cajun/Creole', 'Bars',
                        'Breweries','Vegan','Vietnamese','Indian','Greek','Buffets']
    long_df = df.melt(id_vars=['name','city'],
                      value_vars=category_columns,
                      var_name='Category', value_name='Value')
    # Dummies are stored as strings, hence the comparison against '1'.
    long_df = long_df[long_df['Value'] == '1']
    return long_df.drop(columns=['Value'])
def graph_restaurants_categories(df):
    """
    Draw a restaurant-category network for one city.

    Prompts the user for a city name, filters the (name, city, Category)
    edge list to that city and renders a spring-layout graph linking
    restaurant names to their categories.
    """
    # Choose city to plot
    city_filter = input('Choose the city to filter the graph: \n')
    df = df[df.city == city_filter]
    # Build the graph directly from the edge list (the previous empty
    # nx.Graph() instance was immediately discarded).
    g = nx.from_pandas_edgelist(df, source='name', target='Category')
    # nx.info() was removed in networkx 3.0; print the same summary directly.
    print(f"Graph with {g.number_of_nodes()} nodes and {g.number_of_edges()} edges")
    plt.figure(figsize=(20, 20))
    pos = nx.spring_layout(g, k=0.35)
    nx.draw_networkx(g, pos, node_size=25, node_color='blue')
    return plt.show()
## Data loading and display
# List of dataset path-files (parquet exports)
filenames = ["../data/restaurants.parquet",
             "../data/reviews.parquet",
             "../data/users.parquet"]
# Loading the datasets, one dataframe per file
restaurants, reviews, users = load_data(filenames)
1. Dimensions
# Dimensions of the restaurants dataframe and a first preview
print('The restaurants dataset has', restaurants.shape[0], 'different restaurants and', restaurants.shape[1], 'variables.')
restaurants.head()
The restaurants dataset has 52268 different restaurants and 14 variables.
| business_id | name | address | city | state | postal_code | latitude | longitude | stars | review_count | is_open | attributes | categories | hours | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | MTSW4McQd7CbVtyjqoe9mw | St Honore Pastries | 935 Race St | Philadelphia | PA | 19107 | 39.955505 | -75.155563 | 4.0 | 80 | 1 | {'AcceptsInsurance': None, 'AgesAllowed': None... | Restaurants, Food, Bubble Tea, Coffee & Tea, B... | {'Friday': '7:0-21:0', 'Monday': '7:0-20:0', '... |
| 1 | CF33F8-E6oudUQ46HnavjQ | Sonic Drive-In | 615 S Main St | Ashland City | TN | 37015 | 36.269592 | -87.058945 | 2.0 | 6 | 1 | {'AcceptsInsurance': None, 'AgesAllowed': None... | Burgers, Fast Food, Sandwiches, Food, Ice Crea... | {'Friday': '9:0-0:0', 'Monday': '0:0-0:0', 'Sa... |
| 2 | k0hlBqXX-Bt0vf1op7Jr1w | Tsevi's Pub And Grill | 8025 Mackenzie Rd | Affton | MO | 63123 | 38.565166 | -90.321083 | 3.0 | 19 | 0 | {'AcceptsInsurance': None, 'AgesAllowed': None... | Pubs, Restaurants, Italian, Bars, American (Tr... | None |
| 3 | bBDDEgkFA1Otx9Lfe7BZUQ | Sonic Drive-In | 2312 Dickerson Pike | Nashville | TN | 37207 | 36.208103 | -86.768173 | 1.5 | 10 | 1 | {'AcceptsInsurance': None, 'AgesAllowed': None... | Ice Cream & Frozen Yogurt, Fast Food, Burgers,... | {'Friday': '6:0-16:0', 'Monday': '0:0-0:0', 'S... |
| 4 | eEOYSgkmpB90uNA7lDOMRA | Vietnamese Food Truck | Tampa Bay | FL | 33602 | 27.955269 | -82.456322 | 4.0 | 10 | 1 | {'AcceptsInsurance': None, 'AgesAllowed': None... | Vietnamese, Food, Restaurants, Food Trucks | {'Friday': '11:0-14:0', 'Monday': '11:0-14:0',... |
# Column dtypes of the restaurants dataframe
restaurants.dtypes
business_id object name object address object city object state object postal_code object latitude float32 longitude float32 stars object review_count int32 is_open object attributes object categories object hours object dtype: object
# Duplicate check: total rows vs. distinct business_id values
print(restaurants['business_id'].shape[0], 'rows,', restaurants['business_id'].drop_duplicates().shape[0],
      'distinct restaurants ->',
      str(restaurants['business_id'].shape[0]-restaurants['business_id'].drop_duplicates().shape[0]),
      'duplicates.')
52268 rows, 52268 distinct restaurants -> 0 duplicates.
For this EDA, we are going to omit business_id as it is an identifier. The attributes and hours columns will be omitted as well, since their values are dicts. We will decide how to treat them later.
# Work on a copy so the raw dataframe stays untouched;
# business_id is a pure identifier, attributes/hours are dicts (handled later)
restaurants_eda = restaurants.copy()
restaurants_eda.drop(['business_id', 'attributes', 'hours'],axis = 1, inplace = True)
restaurants_eda.shape
(52268, 11)
# Categorical: summary statistics for the object-typed columns
restaurants_eda.describe(include='object')
| name | address | city | state | postal_code | stars | is_open | categories | |
|---|---|---|---|---|---|---|---|---|
| count | 52268 | 52268 | 52268 | 52268 | 52268 | 52268 | 52268 | 52268 |
| unique | 36732 | 42064 | 920 | 19 | 2116 | 9 | 2 | 31680 |
| top | McDonald's | Philadelphia | PA | 70130 | 4.0 | 1 | Restaurants, Pizza | |
| freq | 703 | 443 | 5852 | 12641 | 602 | 13438 | 34987 | 935 |
# Numeric: summary statistics for the numeric columns
restaurants_eda.describe()
| latitude | longitude | review_count | |
|---|---|---|---|
| count | 52268.000000 | 52268.000000 | 52268.000000 |
| mean | 36.997627 | -87.842453 | 87.267142 |
| std | 6.011383 | 13.811094 | 188.939711 |
| min | 27.564457 | -120.083748 | 5.000000 |
| 25% | 32.217690 | -90.233246 | 13.000000 |
| 50% | 39.484108 | -86.035522 | 33.000000 |
| 75% | 39.958359 | -75.337572 | 89.000000 |
| max | 53.679195 | -74.664459 | 7568.000000 |
# Share of review_count values outside mean +/- 3 std, split by stars class
get_deviation_of_mean_perc(restaurants_eda, ['review_count'], target='stars', multiplier=3)
| index | 2.5 | 3.0 | 3.5 | 4.0 | 4.5 | 5.0 | variable | sum_outlier_values | porcentaje_sum_null_values |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.009777 | 0.030726 | 0.159218 | 0.505587 | 0.290503 | 0.00419 | review_count | 716 | 0.013699 |
There are very few outliers so we decided not to eliminate them in order not to lose information.
# Evaluating the null elements in the dataset (per column and per row):
pd_null_columnas = pd.DataFrame(restaurants_eda.isnull().sum().sort_values(ascending=False), columns=['nulos_columnas'])
pd_null_filas = pd.DataFrame(restaurants_eda.isnull().sum(axis=1).sort_values(ascending=False), columns=['nulos_filas'])
# Getting the null proportions over the dataframe dimensions:
pd_null_columnas['Proporcion_columnas'] = pd_null_columnas['nulos_columnas']/restaurants_eda.shape[0]
pd_null_filas['Proporcion_filas'] = pd_null_filas['nulos_filas']/restaurants_eda.shape[1]
# nulls (rows) display:
pd_null_filas
| nulos_filas | Proporcion_filas | |
|---|---|---|
| 0 | 0 | 0.0 |
| 34849 | 0 | 0.0 |
| 34839 | 0 | 0.0 |
| 34840 | 0 | 0.0 |
| 34841 | 0 | 0.0 |
| ... | ... | ... |
| 17425 | 0 | 0.0 |
| 17426 | 0 | 0.0 |
| 17427 | 0 | 0.0 |
| 17428 | 0 | 0.0 |
| 52267 | 0 | 0.0 |
52268 rows × 2 columns
# nulls (columns) display: null count and proportion per column
pd_null_columnas
| nulos_columnas | Proporcion_columnas | |
|---|---|---|
| name | 0 | 0.0 |
| address | 0 | 0.0 |
| city | 0 | 0.0 |
| state | 0 | 0.0 |
| postal_code | 0 | 0.0 |
| latitude | 0 | 0.0 |
| longitude | 0 | 0.0 |
| stars | 0 | 0.0 |
| review_count | 0 | 0.0 |
| is_open | 0 | 0.0 |
| categories | 0 | 0.0 |
There are no null values in the restaurants data.
# Getting the percentage for the stars column classes:
restaurants_eda['stars'] = restaurants_eda['stars'].astype('float32')
restaurants_percentage = restaurants_eda['stars']\
    .value_counts(normalize=True)\
    .mul(100).rename('Percentage').reset_index()
restaurants_percentage_count = restaurants_eda['stars'].value_counts().reset_index()
# NOTE(review): merging/setting on the 'index' column relies on pandas < 2.0
# reset_index() naming; newer pandas names that column after the series.
restaurants_percentage_distribution = pd.merge(restaurants_percentage, restaurants_percentage_count, on=['index'], how='inner')
restaurants_percentage_distribution.set_index('index').sort_index()
| Percentage | stars | |
|---|---|---|
| index | ||
| 1.0 | 0.472564 | 247 |
| 1.5 | 2.833474 | 1481 |
| 2.0 | 5.565547 | 2909 |
| 2.5 | 9.279100 | 4850 |
| 3.0 | 14.615061 | 7639 |
| 3.5 | 22.564475 | 11794 |
| 4.0 | 25.709803 | 13438 |
| 4.5 | 15.988750 | 8357 |
| 5.0 | 2.971225 | 1553 |
# Plotting the stars distribution as a donut chart:
fig = px.pie(restaurants_percentage_distribution, title='Stars distribution Pie', names="index", values='Percentage', hole=0.6)
fig.update_traces(textposition='outside')
fig.update_layout(legend_title_text=' Stars')
fig.show()
As we can see, 66% of the data are restaurants with more than 3 stars.
# Getting the percentage for the is_open column classes:
is_open_percentage = restaurants_eda['is_open']\
    .value_counts(normalize=True)\
    .mul(100).rename('Percentage').reset_index()
is_open_percentage_count = restaurants_eda['is_open'].value_counts().reset_index()
# NOTE(review): same pandas < 2.0 'index' column dependency as above.
is_open_percentage_distribution = pd.merge(is_open_percentage, is_open_percentage_count, on=['index'], how='inner')
is_open_percentage_distribution[['Percentage','is_open']]
| Percentage | is_open | |
|---|---|---|
| 0 | 66.937706 | 34987 |
| 1 | 33.062294 | 17281 |
# Plotting the open/closed distribution:
fig = px.pie(is_open_percentage_distribution, names="index", values='Percentage', hole=0.6)
fig.show()
As we can see there are more open restaurants than closed in the data.
# Countplot of star ratings split by open/closed status
plt.figure(figsize=(12,8))
ax = sns.countplot(hue='stars', x='is_open' , data =restaurants_eda, palette='Paired')
ax.set_ylabel('Count')
ax.set_xlabel('Is Open')
ax.set_title('Number of Open/Closed Businesses by Star Rating')
# value_counts() is frequency-sorted: the open class (1) is the majority here,
# so it unpacks first
num_open , num_closed = restaurants_eda['is_open'].value_counts()
ax.text(0.6, 8000,
        '(1) Number of Businesses Open: {:.0f}'.format(num_open),
        weight='bold',
        color='blue',
        size='small')
ax.text(0.6, 7500,
        '(0) Number of Businesses Closed: {:.0f}'.format(num_closed),
        weight='bold',
        color='red',
        size='small')
# Annotate each bar with its count
for p in ax.patches:
    width, height = p.get_width(), p.get_height()
    x, y = p.get_xy()
    ax.text(x+width-.05,
            y+height,
            '{:.0f}'.format(height),
            weight='bold',
            horizontalalignment='center',
            size='small')
plt.show()
The stars distribution doesn't seem to be influenced by the is_open variable, since it shows the same star-rating distribution for both of its values.
# Distinct state codes present in the data
restaurants_eda.state.unique()
array(['PA', 'TN', 'MO', 'FL', 'IN', 'AB', 'NV', 'IL', 'AZ', 'LA', 'NJ',
'CA', 'DE', 'ID', 'NC', 'CO', 'HI', 'MT', 'XMS'], dtype=object)
As XMS is not a US state, we will check the address of the restaurants with this value in the state column.
# Inspect the rows whose state is the invalid code XMS
restaurants_eda[restaurants_eda.state=='XMS']
| name | address | city | state | postal_code | latitude | longitude | stars | review_count | is_open | categories | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 34288 | Pumpernickles Catering | 95a Mount Pleasant | Liverpool | XMS | L3 5TB | 38.67305 | -90.423157 | 4.0 | 5 | 1 | Caterers, Breakfast & Brunch, Convenience Stor... |
As XMS is not in USA nor Canada, we decide to drop this restaurant.
# Drop the single XMS (non-US/Canada) restaurant and reset the index
restaurants_eda = restaurants_eda.drop(index = restaurants_eda.index[restaurants_eda.state=='XMS'],
                                       axis=0).reset_index(drop=True)
restaurants_eda.state.unique()
array(['PA', 'TN', 'MO', 'FL', 'IN', 'AB', 'NV', 'IL', 'AZ', 'LA', 'NJ',
'CA', 'DE', 'ID', 'NC', 'CO', 'HI', 'MT'], dtype=object)
# Shape after removing the XMS row
restaurants_eda.shape
(52267, 11)
# Group data and sort by state for counting the businesses.
state_restaurants_counts = restaurants[['state', 'business_id']].groupby(['state'])\
    ['business_id'].agg('count').sort_values(ascending=False)
state_restaurants_counts = pd.DataFrame(data=state_restaurants_counts)
state_restaurants_counts.rename(columns={'business_id' : 'number_of_businesses'}, inplace=True)
# NOTE(review): [:-1] drops the last (smallest) state from the plot — confirm intended
state_restaurants_counts[:-1].sort_values(ascending=False, by="number_of_businesses")\
    .plot(kind='barh', stacked=False, figsize=[10,10], colormap='winter')
plt.title('Top states by businesses listed')
Text(0.5, 1.0, 'Top states by businesses listed')
As we can see, the top 3 states with more restaurants are Pennsylvania, Florida and Tennessee.
# Group data by state: total reviews and mean star rating, sorted by reviews.
state_restaurants_reviews = restaurants_eda[['state', 'review_count', 'stars']].groupby(['state']).\
    agg({'review_count': 'sum', 'stars': 'mean'}).sort_values(by='review_count', ascending=False)
state_restaurants_reviews['review_count'].plot(kind='barh', stacked=False, figsize=[10,10],\
    colormap='winter')
plt.title('Top states by reviews')
Text(0.5, 1.0, 'Top states by reviews')
As we can see, the top 3 states with more reviews are Pennsylvania, Florida and Louisiana. Furthermore, it seems that states with more restaurants also have more reviews, something that we had already suspected before.
Here, we represent those states with a considerable number of reviews (>50K) sorted by the average star rating of their restaurants.
# States with more than 50k reviews, ranked by their average star rating.
state_restaurants_reviews[state_restaurants_reviews.review_count > 50000]['stars'].sort_values()\
    .plot(kind='barh', stacked=False, figsize=[10,10], colormap='winter')
plt.title('States with greater than 50k reviews ranked by average stars')
Text(0.5, 1.0, 'States with greater than 50k reviews ranked by average stars')
California, Louisiana and Florida have the highest average star rating on their reviews.
Now, we will analyze the same information grouped by cities instead of by state.
# Group data and sort by city for counting the businesses.
city_restaurants_counts = restaurants[['city', 'business_id']].groupby(['city'])\
    ['business_id'].agg('count').sort_values(ascending=False)
city_restaurants_counts = pd.DataFrame(data=city_restaurants_counts)
city_restaurants_counts.rename(columns={'business_id' : 'number_of_businesses'}, inplace=True)
# NOTE(review): [0:49] selects 49 rows, not the 50 the title claims —
# confirm the intended slice is [0:50]
city_restaurants_counts[0:49].sort_values(ascending=False, by="number_of_businesses")\
    .plot(kind='barh', stacked=False, figsize=[10,10], colormap='winter')
plt.title('Top 50 cities by businesses listed')
Text(0.5, 1.0, 'Top 50 cities by businesses listed')
As we can see, the top 3 cities with more restaurants are Philadelphia, Tampa and Indianapolis.
# Group data by city: total reviews and mean star rating, sorted by reviews.
city_restaurants_reviews = restaurants_eda[['city', 'review_count', 'stars']].groupby(['city']).\
    agg({'review_count': 'sum', 'stars': 'mean'}).sort_values(by='review_count', ascending=False)
# NOTE(review): [0:49] plots 49 cities, not 50 — confirm intended
city_restaurants_reviews['review_count'][0:49].plot(kind='barh', stacked=False, figsize=[10,10],\
    colormap='winter')
plt.title('Top 50 cities by reviews')
Text(0.5, 1.0, 'Top 50 cities by reviews')
As we can see, the top 3 cities with more reviews are Philadelphia, New Orleans and Nashville. Furthermore, it seems that cities with more restaurants also have more reviews, something that we had already suspected before.
# Cities with more than 50k reviews, ranked by their average star rating.
city_restaurants_reviews[city_restaurants_reviews.review_count > 50000]['stars'].sort_values()\
    .plot(kind='barh', stacked=False, figsize=[10,10], colormap='winter')
plt.title('Cities with greater than 50k reviews ranked by average stars')
Text(0.5, 1.0, 'Cities with greater than 50k reviews ranked by average stars')
Santa Barbara, New Orleans and Saint Petersburg have the highest average star ratings on their reviews. The per-city information agrees with the per-state analysis above, since Santa Barbara belongs to California and Saint Petersburg belongs to Florida.
We will further analyze the categories and attributes columns as we believe they contain valuable information for the model.
# Restart the EDA copy, this time keeping categories and attributes
restaurants_eda = restaurants.copy()
restaurants_eda.drop(['business_id', 'hours'],axis = 1, inplace = True)
We have eliminated the observations belonging to the XMS state, since this is erroneous information (the address belongs to Europe).
# Remove the erroneous XMS observation again on the fresh copy
restaurants_eda = restaurants_eda.drop(index = restaurants_eda.index[restaurants_eda.state=='XMS'],
                                       axis=0).reset_index(drop=True)
restaurants_eda.shape
(52267, 12)
We analyze which are the most common words in categories.
# 100 most common words in the raw categories column
words_counter(restaurants, 'categories', 100)
| word | count | |
|---|---|---|
| 0 | Restaurants | 52343 |
| 1 | Food | 28462 |
| 2 | & | 20298 |
| 3 | Bars | 16076 |
| 4 | American | 15046 |
| ... | ... | ... |
| 95 | Korean | 497 |
| 96 | Donuts | 475 |
| 97 | Flavor | 451 |
| 98 | Tapas/Small | 440 |
| 99 | Plates | 440 |
100 rows × 2 columns
We transform the categories column into a completely new one. We take as a reference the most frequent words analyzed previously and create new categories.
# Listing categories to focus on:
keywords = ['American','Breakfast','Traditional','Sandwiches','Burgers',
            'Mexican', 'Italian','Seafood','Chicken','Chinese','Meat',
            'Japanese','Asian','Steakhouses','Fast','Mediterranean',
            'Vegetarian','Soup','Thai','Southern','Cajun/Creole', 'Bars',
            'Breweries','Vegan','Vietnamese','Indian','Greek','Buffets']
# Creating one dummy column per category: the value is the string '1' if a
# row's categories text contains that keyword (case-insensitive), '0'
# otherwise. Rows that match none of the keywords get 'Other' = '1'.
restaurants_eda = restaurants_eda.assign(**{keyword: ['1' if keyword.lower() in categories.lower()
                                                      else '0' for categories in restaurants_eda['categories']]
                                            for keyword in keywords},
                                         Other=['1' if all(keyword.lower() not in categories.lower()
                                                           for keyword in keywords)
                                                else '0' for categories in restaurants_eda['categories']])
restaurants_eda = restaurants_eda.drop('categories',axis=1)
# Sum each dummy column (columns 11 onwards) to count businesses per category
sum_df = (restaurants_eda.copy()).iloc[:,11:]
sum_df = sum_df.astype('float32')
sum_df = pd.DataFrame(sum_df.sum(), columns=['count'])
sum_df = sum_df.astype({'count': 'int64'})
sum_df = sum_df.sort_values(by='count')
sum_df
| count | |
|---|---|
| Meat | 228 |
| Breweries | 407 |
| Buffets | 622 |
| Greek | 779 |
| Indian | 838 |
| Vietnamese | 852 |
| Vegan | 856 |
| Cajun/Creole | 923 |
| Thai | 971 |
| Southern | 988 |
| Soup | 1061 |
| Vegetarian | 1158 |
| Mediterranean | 1263 |
| Steakhouses | 1506 |
| Asian | 1574 |
| Japanese | 1830 |
| Chinese | 3170 |
| Chicken | 3373 |
| Seafood | 3578 |
| Italian | 4573 |
| Mexican | 4614 |
| Burgers | 5636 |
| Breakfast | 6264 |
| Other | 7420 |
| Traditional | 8143 |
| Sandwiches | 8366 |
| Bars | 10934 |
| Fast | 12018 |
| American | 13066 |
# Plotting the number of businesses per dummy category
plt.figure(figsize=(10, 7))
sns.barplot(x='count', y=sum_df.index, data=sum_df, orient='h',color='blue')
plt.title('Dummy columns')
plt.show()
Now we are going to make use of graphs to check similarities of restaurants based on their categories. So we need a dataframe with the name, city and category of the restaurant
# Keep only name, city and the dummy category columns for the graph analysis
restaurants_graph = restaurants_eda.copy()
restaurants_graph = restaurants_graph[['name','city','American','Breakfast','Traditional','Sandwiches',
                                       'Burgers','Mexican', 'Italian','Seafood','Chicken','Chinese',
                                       'Meat','Japanese','Asian','Steakhouses','Fast','Mediterranean',
                                       'Vegetarian','Soup','Thai','Southern','Cajun/Creole', 'Bars',
                                       'Breweries','Vegan','Vietnamese','Indian','Greek','Buffets']]
restaurants_graph.head()
| name | city | American | Breakfast | Traditional | Sandwiches | Burgers | Mexican | Italian | Seafood | ... | Thai | Southern | Cajun/Creole | Bars | Breweries | Vegan | Vietnamese | Indian | Greek | Buffets | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | St Honore Pastries | Philadelphia | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 1 | Sonic Drive-In | Ashland City | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 2 | Tsevi's Pub And Grill | Affton | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 |
| 3 | Sonic Drive-In | Nashville | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 4 | Vietnamese Food Truck | Tampa Bay | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
5 rows × 30 columns
As we need unique restaurants, we modify the name column to make its values unique (if two restaurants are named "McDonald's", the second one becomes "McDonald's_1"). Moreover, we unpivot the table into a single Category column in order to match each restaurant name with its categories.
# Make restaurant names unique, then unpivot to (name, city, Category) rows
restaurants_graph = column_to_unique(restaurants_graph,'name')
restaurants_graph = melt_categories(restaurants_graph)
restaurants_graph.head()
| name | city | Category | |
|---|---|---|---|
| 2 | Tsevi's Pub And Grill | Affton | American |
| 5 | Denny's | Indianapolis | American |
| 18 | Craft Hall | Philadelphia | American |
| 22 | Caviar & Bananas | Nashville | American |
| 25 | Roman Forum | Tampa | American |
# Interactive: prompts for a city and draws its restaurant-category graph
graph_restaurants_categories(restaurants_graph)
Choose the city to filter the graph: Sewell Graph with 89 nodes and 152 edges
As we can see, there is an 'Asian' zone connected at the top of the graph (Chinese - Japanese - Asian - Thai). On the bottom-right there is another, more 'Traditional' zone with steakhouses, burgers, sandwiches, Italian, American, etc. There are other relations as well, such as Italian and Mediterranean.
The attributes column is a dictionary. Let's analyze it further and transform it into several columns with the attributes that we think provide the most relevant information.
import warnings
# Ignore warnings (already filtered at the top of the file; kept so this
# notebook cell also runs standalone)
warnings.simplefilter("ignore")
pd.options.display.max_columns = 40
# Expand the attributes dict column into one dataframe column per attribute
attributes = restaurants_eda["attributes"].apply(pd.Series)
# Replace None values with NaN.
# Fix: pd.np was removed in pandas >= 1.x; use numpy (imported at file top).
attributes.fillna(value=np.nan,inplace=True)
attributes.head()
| AcceptsInsurance | AgesAllowed | Alcohol | Ambience | BYOB | BYOBCorkage | BestNights | BikeParking | BusinessAcceptsBitcoin | BusinessAcceptsCreditCards | BusinessParking | ByAppointmentOnly | Caters | CoatCheck | Corkage | DietaryRestrictions | DogsAllowed | DriveThru | GoodForDancing | GoodForKids | GoodForMeal | HairSpecializesIn | HappyHour | HasTV | Music | NoiseLevel | Open24Hours | OutdoorSeating | RestaurantsAttire | RestaurantsCounterService | RestaurantsDelivery | RestaurantsGoodForGroups | RestaurantsPriceRange2 | RestaurantsReservations | RestaurantsTableService | RestaurantsTakeOut | Smoking | WheelchairAccessible | WiFi | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | NaN | NaN | u'none' | NaN | NaN | NaN | NaN | True | NaN | False | {'garage': False, 'street': True, 'validated':... | False | True | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | False | NaN | NaN | False | NaN | 1 | NaN | NaN | True | NaN | NaN | u'free' |
| 1 | NaN | NaN | u'none' | None | NaN | NaN | NaN | False | NaN | True | None | False | False | False | NaN | NaN | False | True | NaN | True | NaN | NaN | False | True | NaN | NaN | NaN | True | u'casual' | NaN | True | True | 1 | False | False | True | NaN | True | u'no' |
| 2 | NaN | NaN | u'full_bar' | {'romantic': False, 'intimate': False, 'touris... | NaN | NaN | NaN | NaN | NaN | True | {'garage': False, 'street': False, 'validated'... | NaN | True | NaN | NaN | NaN | NaN | NaN | NaN | True | NaN | NaN | NaN | True | NaN | u'average' | NaN | True | u'casual' | NaN | False | True | 1 | False | NaN | True | NaN | NaN | u'free' |
| 3 | NaN | NaN | u'none' | NaN | NaN | NaN | NaN | NaN | NaN | True | {'garage': False, 'street': False, 'validated'... | False | False | False | NaN | NaN | False | True | NaN | True | NaN | NaN | False | True | NaN | NaN | NaN | True | 'casual' | NaN | True | False | 1 | False | False | True | NaN | True | u'no' |
| 4 | NaN | NaN | 'none' | {'touristy': False, 'hipster': False, 'romanti... | NaN | NaN | NaN | NaN | NaN | NaN | {'garage': False, 'street': False, 'validated'... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | {'dessert': False, 'latenight': False, 'lunch'... | NaN | NaN | NaN | NaN | NaN | NaN | None | NaN | NaN | NaN | NaN | NaN | False | NaN | NaN | NaN | NaN | NaN |
# The attributes now live in their own dataframe; drop the original column
restaurants_eda.drop(['attributes'],axis = 1, inplace = True)
attributes.shape
(52267, 39)
In order to avoid subjectivity, we drop the columns GoodForKids, RestaurantsGoodForGroups, BestNights, NoiseLevel, GoodForDancing, GoodForMeal, and Ambience.
# Drop the attribute columns considered too subjective to use
attributes = attributes.drop(['GoodForKids','RestaurantsGoodForGroups','BestNights',
                              'NoiseLevel','GoodForDancing','Ambience','GoodForMeal'], axis=1)
# Evaluating the null elements in the dataset:
pd_null_columnas = pd.DataFrame(attributes.isnull().sum().sort_values(ascending=False), columns=['nulos_columnas'])
pd_null_filas = pd.DataFrame(attributes.isnull().sum(axis=1).sort_values(ascending=False), columns=['nulos_filas'])
# Getting the null proportions:
pd_null_columnas['Proporcion_columnas'] = pd_null_columnas['nulos_columnas']/attributes.shape[0]
pd_null_filas['Proporcion_filas'] = pd_null_filas['nulos_filas']/attributes.shape[1]
# nulls (columns) display:
pd_null_columnas
| nulos_columnas | Proporcion_columnas | |
|---|---|---|
| HairSpecializesIn | 52265 | 0.999962 |
| RestaurantsCounterService | 52249 | 0.999656 |
| AcceptsInsurance | 52248 | 0.999636 |
| Open24Hours | 52246 | 0.999598 |
| DietaryRestrictions | 52237 | 0.999426 |
| AgesAllowed | 52179 | 0.998316 |
| BYOBCorkage | 50825 | 0.972411 |
| ByAppointmentOnly | 48921 | 0.935983 |
| Corkage | 48835 | 0.934337 |
| Smoking | 48820 | 0.934050 |
| BYOB | 48045 | 0.919222 |
| CoatCheck | 47987 | 0.918113 |
| Music | 46704 | 0.893566 |
| DriveThru | 46032 | 0.880709 |
| BusinessAcceptsBitcoin | 45536 | 0.871219 |
| DogsAllowed | 40790 | 0.780416 |
| HappyHour | 39258 | 0.751105 |
| WheelchairAccessible | 38712 | 0.740659 |
| RestaurantsTableService | 32530 | 0.622381 |
| Caters | 17697 | 0.338588 |
| BikeParking | 16784 | 0.321120 |
| WiFi | 14519 | 0.277785 |
| RestaurantsAttire | 13276 | 0.254003 |
| Alcohol | 12287 | 0.235081 |
| HasTV | 10350 | 0.198022 |
| RestaurantsReservations | 9161 | 0.175273 |
| OutdoorSeating | 8628 | 0.165075 |
| RestaurantsPriceRange2 | 7768 | 0.148622 |
| BusinessAcceptsCreditCards | 6710 | 0.128379 |
| BusinessParking | 6657 | 0.127365 |
| RestaurantsDelivery | 4613 | 0.088258 |
| RestaurantsTakeOut | 3555 | 0.068016 |
We have decided to keep those attributes that have less than 25% null values.
# Keep only the attributes with less than 25% null values
attributes = attributes.loc[:,(pd_null_columnas.index[pd_null_columnas.Proporcion_columnas<0.25]).tolist()]
# Re-evaluating the null elements after the filter:
pd_null_columnas = pd.DataFrame(attributes.isnull().sum().sort_values(ascending=False), columns=['nulos_columnas'])
pd_null_filas = pd.DataFrame(attributes.isnull().sum(axis=1).sort_values(ascending=False), columns=['nulos_filas'])
# Getting the null proportions:
pd_null_columnas['Proporcion_columnas'] = pd_null_columnas['nulos_columnas']/attributes.shape[0]
pd_null_filas['Proporcion_filas'] = pd_null_filas['nulos_filas']/attributes.shape[1]
# nulls (columns) display:
pd_null_columnas
| nulos_columnas | Proporcion_columnas | |
|---|---|---|
| Alcohol | 12287 | 0.235081 |
| HasTV | 10350 | 0.198022 |
| RestaurantsReservations | 9161 | 0.175273 |
| OutdoorSeating | 8628 | 0.165075 |
| RestaurantsPriceRange2 | 7768 | 0.148622 |
| BusinessAcceptsCreditCards | 6710 | 0.128379 |
| BusinessParking | 6657 | 0.127365 |
| RestaurantsDelivery | 4613 | 0.088258 |
| RestaurantsTakeOut | 3555 | 0.068016 |
# nulls (rows) display: number of rows for each per-row null count
pd_null_filas.groupby('nulos_filas').agg('count')
| Proporcion_filas | |
|---|---|
| nulos_filas | |
| 0 | 31860 |
| 1 | 6262 |
| 2 | 3055 |
| 3 | 2548 |
| 4 | 2516 |
| 5 | 1609 |
| 6 | 1532 |
| 7 | 1398 |
| 8 | 757 |
| 9 | 730 |
attributes.head()
| Alcohol | HasTV | RestaurantsReservations | OutdoorSeating | RestaurantsPriceRange2 | BusinessAcceptsCreditCards | BusinessParking | RestaurantsDelivery | RestaurantsTakeOut | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | u'none' | NaN | NaN | False | 1 | False | {'garage': False, 'street': True, 'validated':... | False | True |
| 1 | u'none' | True | False | True | 1 | True | None | True | True |
| 2 | u'full_bar' | True | False | True | 1 | True | {'garage': False, 'street': False, 'validated'... | False | True |
| 3 | u'none' | True | False | True | 1 | True | {'garage': False, 'street': False, 'validated'... | True | True |
| 4 | 'none' | NaN | False | None | NaN | NaN | {'garage': False, 'street': False, 'validated'... | NaN | NaN |
attributes.shape
(52267, 9)
attributes['Alcohol'].unique()
array(["u'none'", "u'full_bar'", "'none'", "'full_bar'",
"u'beer_and_wine'", nan, "'beer_and_wine'", 'None'], dtype=object)
# Collapse the raw category strings into a Yes/No flag: any full-bar or
# beer-and-wine variant -> 'Yes'; any of the none variants -> 'No'.
alcohol_map = {"u'full_bar'": 'Yes', "'full_bar'": 'Yes',
               "u'beer_and_wine'": 'Yes', "'beer_and_wine'": 'Yes',
               "u'none'": 'No', "'none'": 'No', 'None': 'No'}
attributes['Alcohol'] = attributes['Alcohol'].replace(alcohol_map)
# Missing values become their own 'Unknown' category:
attributes['Alcohol'] = attributes['Alcohol'].fillna('Unknown')
attributes['HasTV'].unique()
array([nan, 'True', 'False', 'None'], dtype=object)
# Recode the stringified booleans: 'True' -> 'Yes'; 'False' and 'None' -> 'No'.
attributes['HasTV'] = attributes['HasTV'].replace(
    {'True': 'Yes', 'False': 'No', 'None': 'No'})
# Missing values become their own 'Unknown' category:
attributes['HasTV'] = attributes['HasTV'].fillna('Unknown')
attributes['RestaurantsReservations'].unique()
array([nan, 'False', 'True', 'None'], dtype=object)
# Recode the stringified booleans: 'True' -> 'Yes'; 'False' and 'None' -> 'No'.
attributes['RestaurantsReservations'] = attributes['RestaurantsReservations'].replace(
    {'True': 'Yes', 'False': 'No', 'None': 'No'})
# Missing values become their own 'Unknown' category:
attributes['RestaurantsReservations'] = attributes['RestaurantsReservations'].fillna('Unknown')
attributes['OutdoorSeating'].unique()
array(['False', 'True', 'None', nan], dtype=object)
# Recode the stringified booleans: 'True' -> 'Yes'; 'False' and 'None' -> 'No'.
attributes['OutdoorSeating'] = attributes['OutdoorSeating'].replace(
    {'True': 'Yes', 'False': 'No', 'None': 'No'})
# Missing values become their own 'Unknown' category:
attributes['OutdoorSeating'] = attributes['OutdoorSeating'].fillna('Unknown')
attributes['RestaurantsPriceRange2'].unique()
array(['1', nan, '2', '3', '4', 'None'], dtype=object)
# Recode price range: 1 and 2 -> 'Cheap', 3 and 4 -> 'Expensive'. The literal
# string 'None' is coerced to NaN by to_numeric(errors='coerce').
attributes['RestaurantsPriceRange2'] = pd.to_numeric(attributes['RestaurantsPriceRange2'], errors='coerce')
attributes['RestaurantsPriceRange2'] = attributes['RestaurantsPriceRange2'].replace(
    {1: 'Cheap', 2: 'Cheap', 3: 'Expensive', 4: 'Expensive'})
# Remaining NaN (original nan or 'None') become 'Unknown':
attributes['RestaurantsPriceRange2'] = attributes['RestaurantsPriceRange2'].fillna('Unknown')
attributes['BusinessAcceptsCreditCards'].unique()
array(['False', 'True', nan, 'None'], dtype=object)
# Recode the stringified booleans: 'True' -> 'Yes'; 'False' and 'None' -> 'No'.
attributes['BusinessAcceptsCreditCards'] = attributes['BusinessAcceptsCreditCards'].replace(
    {'True': 'Yes', 'False': 'No', 'None': 'No'})
# Missing values become their own 'Unknown' category:
attributes['BusinessAcceptsCreditCards'] = attributes['BusinessAcceptsCreditCards'].fillna('Unknown')
attributes['RestaurantsDelivery'].unique()
array(['False', 'True', nan, 'None'], dtype=object)
# Recode the stringified booleans: 'True' -> 'Yes'; 'False' and 'None' -> 'No'.
attributes['RestaurantsDelivery'] = attributes['RestaurantsDelivery'].replace(
    {'True': 'Yes', 'False': 'No', 'None': 'No'})
# Missing values become their own 'Unknown' category:
attributes['RestaurantsDelivery'] = attributes['RestaurantsDelivery'].fillna('Unknown')
attributes['RestaurantsTakeOut'].unique()
array(['True', nan, 'None', 'False'], dtype=object)
# Recode the stringified booleans: 'True' -> 'Yes'; 'False' and 'None' -> 'No'.
attributes['RestaurantsTakeOut'] = attributes['RestaurantsTakeOut'].replace(
    {'True': 'Yes', 'False': 'No', 'None': 'No'})
# Missing values become their own 'Unknown' category:
attributes['RestaurantsTakeOut'] = attributes['RestaurantsTakeOut'].fillna('Unknown')
This attribute has the values: street, lot, garage, validated and valet. We consider 'garage', 'validated' and 'valet' better than the others, so we recode the variable into premium parking or not.
# We pass each value to a function to unpack the dict-as-string and join the
# keys whose value is True; non-string entries (NaN/None) stay NaN.
# BUG FIX: the `pd.np` alias was removed in pandas >= 1.0 — use np.nan directly.
attributes['BusinessParking'] = attributes['BusinessParking'].apply(
    lambda x: extract_true_values(ast.literal_eval(x)) if isinstance(x, str) else np.nan)
attributes['BusinessParking'].unique()
array(['street', 'None', 'lot', nan, 'street + lot',
'garage + validated + valet', 'garage', 'garage + street',
'validated', 'street + valet', 'lot + valet',
'garage + street + validated + lot + valet',
'garage + street + lot', 'street + validated + lot',
'garage + lot', 'valet + garage', 'valet', 'street + lot + valet',
'garage + valet', 'street + lot + validated',
'garage + street + validated + lot', 'garage + validated',
'garage + street + validated', 'street + validated + valet',
'garage + street + valet', 'validated + lot',
'garage + lot + valet', 'street + validated',
'garage + street + lot + valet', 'garage + validated + lot',
'valet + street + lot', 'garage + street + validated + valet',
'validated + valet', 'validated + lot + valet',
'valet + validated', 'valet + garage + street',
'street + validated + lot + valet', 'valet + lot',
'valet + street + validated', 'valet + street',
'garage + validated + lot + valet', 'valet + garage + lot',
'lot + validated', 'valet + garage + street + lot',
'valet + street + lot + validated',
'garage + street + lot + validated'], dtype=object)
# Flag premium parking from the unpacked keys via has_target_words
# (presumably matches garage/validated/valet — defined elsewhere in the file):
attributes['Premium_Parking'] = attributes['BusinessParking'].apply(has_target_words)
# Resulting categories:
attributes['Premium_Parking'].unique()
array(['No', 'Unknown', 'Yes'], dtype=object)
attributes = attributes.drop('BusinessParking',axis=1)
attributes.shape
(52267, 9)
restaurants_eda.shape
(52267, 39)
# Column-wise concat; assumes restaurants_eda and attributes share the same
# row index (both 52267 rows) — TODO confirm index alignment before relying on it.
restaurants_eda = pd.concat([restaurants_eda, attributes], axis=1)
restaurants_eda.head()
| name | address | city | state | postal_code | latitude | longitude | stars | review_count | is_open | American | Breakfast | Traditional | Sandwiches | Burgers | Mexican | Italian | Seafood | Chicken | Chinese | ... | Thai | Southern | Cajun/Creole | Bars | Breweries | Vegan | Vietnamese | Indian | Greek | Buffets | Other | Alcohol | HasTV | RestaurantsReservations | OutdoorSeating | RestaurantsPriceRange2 | BusinessAcceptsCreditCards | RestaurantsDelivery | RestaurantsTakeOut | Premium_Parking | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | St Honore Pastries | 935 Race St | Philadelphia | PA | 19107 | 39.955505 | -75.155563 | 4.0 | 80 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | No | Unknown | Unknown | No | Cheap | No | No | Yes | No |
| 1 | Sonic Drive-In | 615 S Main St | Ashland City | TN | 37015 | 36.269592 | -87.058945 | 2.0 | 6 | 1 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | No | Yes | No | Yes | Cheap | Yes | Yes | Yes | No |
| 2 | Tsevi's Pub And Grill | 8025 Mackenzie Rd | Affton | MO | 63123 | 38.565166 | -90.321083 | 3.0 | 19 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | Yes | Yes | No | Yes | Cheap | Yes | No | Yes | No |
| 3 | Sonic Drive-In | 2312 Dickerson Pike | Nashville | TN | 37207 | 36.208103 | -86.768173 | 1.5 | 10 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | No | Yes | No | Yes | Cheap | Yes | Yes | Yes | Unknown |
| 4 | Vietnamese Food Truck | Tampa Bay | FL | 33602 | 27.955269 | -82.456322 | 4.0 | 10 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | No | Unknown | No | No | Unknown | Unknown | Unknown | Unknown | Unknown |
5 rows × 48 columns
We are going to save the extracted data in parquet format.
# Persist the cleaned dataset to disk in parquet format.
dataframes = [restaurants_eda]
filenames = ["../data/restaurants_cleaned.parquet"]
save_dfs_to_parquet(dataframes, filenames)
# Matrix of Cramér's V values between the selected categorical variables.
df = restaurants_eda.copy()
# Columns to exclude (identifiers and numeric variables):
excluded_columns = ['name', 'address', 'stars', 'is_open', 'review_count',
                    'postal_code', 'latitude', 'longitude']
# FIX: keep the dataframe's original column order for a deterministic,
# reproducible matrix — set.difference() returns an arbitrary order that
# changes between runs, reshuffling the matrix/heatmap every execution.
selected_columns = [c for c in df.columns if c not in excluded_columns]
selected_columns
rows = []
for var1 in selected_columns:
    col = []
    for var2 in selected_columns:
        cramers = cramers_corrected_stat(df[var1], df[var2])  # Cramér's V test
        col.append(round(cramers, 2))
    rows.append(col)
cramers_results = np.array(rows)
cramer_matrix = pd.DataFrame(cramers_results, columns=selected_columns, index=selected_columns)
cramer_matrix
| Cajun/Creole | Breweries | Vietnamese | Vegan | Greek | Southern | Chicken | Fast | Alcohol | state | RestaurantsReservations | HasTV | American | Chinese | Vegetarian | Breakfast | Sandwiches | Italian | Steakhouses | Traditional | Mexican | BusinessAcceptsCreditCards | RestaurantsPriceRange2 | RestaurantsDelivery | Meat | Japanese | Asian | Other | OutdoorSeating | Seafood | city | Buffets | Burgers | RestaurantsTakeOut | Mediterranean | Soup | Premium_Parking | Thai | Indian | Bars | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Cajun/Creole | 1.00 | 0.01 | 0.01 | 0.01 | 0.02 | 0.12 | 0.00 | 0.02 | 0.08 | 0.26 | 0.04 | 0.02 | 0.01 | 0.03 | 0.02 | 0.00 | 0.02 | 0.03 | 0.00 | 0.00 | 0.03 | 0.02 | 0.03 | 0.04 | 0.00 | 0.02 | 0.01 | 0.05 | 0.02 | 0.18 | 0.27 | 0.01 | 0.02 | 0.02 | 0.02 | 0.01 | 0.02 | 0.02 | 0.01 | 0.01 |
| Breweries | 0.01 | 1.00 | 0.01 | 0.01 | 0.01 | 0.00 | 0.01 | 0.04 | 0.08 | 0.04 | 0.03 | 0.03 | 0.07 | 0.02 | 0.01 | 0.02 | 0.01 | 0.02 | 0.01 | 0.05 | 0.02 | 0.01 | 0.02 | 0.02 | 0.00 | 0.02 | 0.01 | 0.04 | 0.06 | 0.01 | 0.00 | 0.01 | 0.00 | 0.00 | 0.01 | 0.01 | 0.04 | 0.01 | 0.01 | 0.08 |
| Vietnamese | 0.01 | 0.01 | 1.00 | 0.00 | 0.02 | 0.01 | 0.03 | 0.06 | 0.02 | 0.09 | 0.03 | 0.02 | 0.06 | 0.06 | 0.02 | 0.04 | 0.01 | 0.04 | 0.02 | 0.05 | 0.04 | 0.04 | 0.02 | 0.01 | 0.01 | 0.00 | 0.10 | 0.05 | 0.06 | 0.02 | 0.07 | 0.01 | 0.04 | 0.01 | 0.02 | 0.06 | 0.02 | 0.07 | 0.01 | 0.03 |
| Vegan | 0.01 | 0.01 | 0.00 | 1.00 | 0.00 | 0.01 | 0.02 | 0.02 | 0.03 | 0.03 | 0.01 | 0.10 | 0.03 | 0.02 | 0.37 | 0.00 | 0.01 | 0.02 | 0.02 | 0.03 | 0.01 | 0.00 | 0.04 | 0.02 | 0.00 | 0.01 | 0.01 | 0.05 | 0.03 | 0.02 | 0.00 | 0.01 | 0.03 | 0.01 | 0.01 | 0.01 | 0.03 | 0.01 | 0.06 | 0.02 |
| Greek | 0.02 | 0.01 | 0.02 | 0.00 | 1.00 | 0.00 | 0.01 | 0.02 | 0.03 | 0.06 | 0.02 | 0.03 | 0.02 | 0.03 | 0.01 | 0.01 | 0.00 | 0.00 | 0.01 | 0.01 | 0.03 | 0.01 | 0.01 | 0.02 | 0.00 | 0.02 | 0.02 | 0.05 | 0.03 | 0.00 | 0.06 | 0.01 | 0.03 | 0.02 | 0.40 | 0.00 | 0.02 | 0.02 | 0.01 | 0.04 |
| Southern | 0.12 | 0.00 | 0.01 | 0.01 | 0.00 | 1.00 | 0.05 | 0.03 | 0.04 | 0.14 | 0.02 | 0.03 | 0.07 | 0.03 | 0.01 | 0.06 | 0.01 | 0.03 | 0.00 | 0.07 | 0.03 | 0.02 | 0.00 | 0.01 | 0.00 | 0.03 | 0.02 | 0.06 | 0.02 | 0.04 | 0.13 | 0.00 | 0.01 | 0.01 | 0.01 | 0.01 | 0.04 | 0.02 | 0.01 | 0.00 |
| Chicken | 0.00 | 0.01 | 0.03 | 0.02 | 0.01 | 0.05 | 1.00 | 0.05 | 0.07 | 0.06 | 0.10 | 0.05 | 0.03 | 0.05 | 0.03 | 0.06 | 0.07 | 0.06 | 0.00 | 0.07 | 0.07 | 0.03 | 0.05 | 0.12 | 0.01 | 0.04 | 0.02 | 0.11 | 0.07 | 0.00 | 0.10 | 0.01 | 0.01 | 0.04 | 0.02 | 0.02 | 0.07 | 0.03 | 0.02 | 0.02 |
| Fast | 0.02 | 0.04 | 0.06 | 0.02 | 0.02 | 0.03 | 0.05 | 1.00 | 0.22 | 0.10 | 0.20 | 0.05 | 0.04 | 0.10 | 0.02 | 0.68 | 0.11 | 0.10 | 0.05 | 0.07 | 0.01 | 0.03 | 0.09 | 0.15 | 0.02 | 0.09 | 0.06 | 0.22 | 0.01 | 0.07 | 0.13 | 0.03 | 0.28 | 0.07 | 0.04 | 0.00 | 0.04 | 0.07 | 0.06 | 0.14 |
| Alcohol | 0.08 | 0.08 | 0.02 | 0.03 | 0.03 | 0.04 | 0.07 | 0.22 | 1.00 | 0.17 | 0.49 | 0.46 | 0.27 | 0.09 | 0.06 | 0.09 | 0.16 | 0.08 | 0.15 | 0.19 | 0.05 | 0.31 | 0.32 | 0.30 | 0.02 | 0.06 | 0.04 | 0.20 | 0.49 | 0.13 | 0.21 | 0.03 | 0.06 | 0.25 | 0.04 | 0.05 | 0.31 | 0.04 | 0.02 | 0.43 |
| state | 0.26 | 0.04 | 0.09 | 0.03 | 0.06 | 0.14 | 0.06 | 0.10 | 0.17 | 1.00 | 0.08 | 0.06 | 0.09 | 0.06 | 0.03 | 0.05 | 0.04 | 0.10 | 0.03 | 0.08 | 0.15 | 0.39 | 0.07 | 0.08 | 0.00 | 0.04 | 0.03 | 0.07 | 0.12 | 0.12 | 0.95 | 0.06 | 0.08 | 0.05 | 0.03 | 0.01 | 0.09 | 0.02 | 0.06 | 0.06 |
| RestaurantsReservations | 0.04 | 0.03 | 0.03 | 0.01 | 0.02 | 0.02 | 0.10 | 0.20 | 0.49 | 0.08 | 1.00 | 0.42 | 0.16 | 0.04 | 0.03 | 0.06 | 0.16 | 0.13 | 0.14 | 0.09 | 0.05 | 0.32 | 0.39 | 0.34 | 0.03 | 0.13 | 0.09 | 0.18 | 0.41 | 0.13 | 0.13 | 0.04 | 0.14 | 0.30 | 0.06 | 0.04 | 0.24 | 0.10 | 0.09 | 0.22 |
| HasTV | 0.02 | 0.03 | 0.02 | 0.10 | 0.03 | 0.03 | 0.05 | 0.05 | 0.46 | 0.06 | 0.42 | 1.00 | 0.14 | 0.03 | 0.10 | 0.12 | 0.01 | 0.05 | 0.07 | 0.12 | 0.06 | 0.26 | 0.27 | 0.29 | 0.03 | 0.04 | 0.04 | 0.19 | 0.43 | 0.05 | 0.11 | 0.03 | 0.07 | 0.26 | 0.04 | 0.04 | 0.22 | 0.05 | 0.04 | 0.16 |
| American | 0.01 | 0.07 | 0.06 | 0.03 | 0.02 | 0.07 | 0.03 | 0.04 | 0.27 | 0.09 | 0.16 | 0.14 | 1.00 | 0.12 | 0.00 | 0.15 | 0.06 | 0.09 | 0.09 | 0.74 | 0.09 | 0.09 | 0.09 | 0.12 | 0.02 | 0.10 | 0.05 | 0.23 | 0.18 | 0.06 | 0.11 | 0.01 | 0.09 | 0.06 | 0.02 | 0.02 | 0.15 | 0.07 | 0.06 | 0.25 |
| Chinese | 0.03 | 0.02 | 0.06 | 0.02 | 0.03 | 0.03 | 0.05 | 0.10 | 0.09 | 0.06 | 0.04 | 0.03 | 0.12 | 1.00 | 0.01 | 0.09 | 0.11 | 0.07 | 0.04 | 0.09 | 0.08 | 0.02 | 0.04 | 0.06 | 0.01 | 0.10 | 0.20 | 0.10 | 0.17 | 0.01 | 0.03 | 0.09 | 0.09 | 0.05 | 0.04 | 0.04 | 0.05 | 0.07 | 0.02 | 0.07 |
| Vegetarian | 0.02 | 0.01 | 0.02 | 0.37 | 0.01 | 0.01 | 0.03 | 0.02 | 0.06 | 0.03 | 0.03 | 0.10 | 0.00 | 0.01 | 1.00 | 0.01 | 0.01 | 0.03 | 0.02 | 0.01 | 0.00 | 0.02 | 0.01 | 0.04 | 0.01 | 0.01 | 0.03 | 0.06 | 0.04 | 0.00 | 0.00 | 0.04 | 0.03 | 0.03 | 0.04 | 0.09 | 0.05 | 0.02 | 0.11 | 0.01 |
| Breakfast | 0.00 | 0.02 | 0.04 | 0.00 | 0.01 | 0.06 | 0.06 | 0.68 | 0.09 | 0.05 | 0.06 | 0.12 | 0.15 | 0.09 | 0.01 | 1.00 | 0.07 | 0.07 | 0.02 | 0.15 | 0.03 | 0.04 | 0.03 | 0.04 | 0.01 | 0.07 | 0.05 | 0.15 | 0.07 | 0.03 | 0.05 | 0.01 | 0.02 | 0.03 | 0.02 | 0.04 | 0.08 | 0.05 | 0.04 | 0.03 |
| Sandwiches | 0.02 | 0.01 | 0.01 | 0.01 | 0.00 | 0.01 | 0.07 | 0.11 | 0.16 | 0.04 | 0.16 | 0.01 | 0.06 | 0.11 | 0.01 | 0.07 | 1.00 | 0.02 | 0.03 | 0.03 | 0.11 | 0.03 | 0.07 | 0.11 | 0.03 | 0.08 | 0.06 | 0.18 | 0.02 | 0.04 | 0.07 | 0.04 | 0.01 | 0.08 | 0.01 | 0.12 | 0.04 | 0.06 | 0.05 | 0.09 |
| Italian | 0.03 | 0.02 | 0.04 | 0.02 | 0.00 | 0.03 | 0.06 | 0.10 | 0.08 | 0.10 | 0.13 | 0.05 | 0.09 | 0.07 | 0.03 | 0.07 | 0.02 | 1.00 | 0.01 | 0.08 | 0.09 | 0.03 | 0.06 | 0.07 | 0.01 | 0.05 | 0.04 | 0.13 | 0.06 | 0.00 | 0.14 | 0.02 | 0.08 | 0.04 | 0.01 | 0.00 | 0.04 | 0.04 | 0.03 | 0.04 |
| Steakhouses | 0.00 | 0.01 | 0.02 | 0.02 | 0.01 | 0.00 | 0.00 | 0.05 | 0.15 | 0.03 | 0.14 | 0.07 | 0.09 | 0.04 | 0.02 | 0.02 | 0.03 | 0.01 | 1.00 | 0.10 | 0.04 | 0.02 | 0.21 | 0.04 | 0.00 | 0.04 | 0.01 | 0.07 | 0.05 | 0.16 | 0.03 | 0.01 | 0.03 | 0.03 | 0.01 | 0.02 | 0.10 | 0.02 | 0.02 | 0.09 |
| Traditional | 0.00 | 0.05 | 0.05 | 0.03 | 0.01 | 0.07 | 0.07 | 0.07 | 0.19 | 0.08 | 0.09 | 0.12 | 0.74 | 0.09 | 0.01 | 0.15 | 0.03 | 0.08 | 0.10 | 1.00 | 0.10 | 0.07 | 0.05 | 0.08 | 0.01 | 0.08 | 0.05 | 0.17 | 0.12 | 0.04 | 0.11 | 0.00 | 0.13 | 0.04 | 0.03 | 0.01 | 0.10 | 0.05 | 0.05 | 0.20 |
| Mexican | 0.03 | 0.02 | 0.04 | 0.01 | 0.03 | 0.03 | 0.07 | 0.01 | 0.05 | 0.15 | 0.05 | 0.06 | 0.09 | 0.08 | 0.00 | 0.03 | 0.11 | 0.09 | 0.04 | 0.10 | 1.00 | 0.02 | 0.06 | 0.00 | 0.00 | 0.05 | 0.04 | 0.13 | 0.04 | 0.02 | 0.14 | 0.02 | 0.06 | 0.01 | 0.04 | 0.03 | 0.02 | 0.04 | 0.03 | 0.05 |
| BusinessAcceptsCreditCards | 0.02 | 0.01 | 0.04 | 0.00 | 0.01 | 0.02 | 0.03 | 0.03 | 0.31 | 0.39 | 0.32 | 0.26 | 0.09 | 0.02 | 0.02 | 0.04 | 0.03 | 0.03 | 0.02 | 0.07 | 0.02 | 1.00 | 0.33 | 0.24 | 0.01 | 0.02 | 0.02 | 0.07 | 0.31 | 0.02 | 0.40 | 0.01 | 0.03 | 0.27 | 0.01 | 0.02 | 0.20 | 0.01 | 0.03 | 0.07 |
| RestaurantsPriceRange2 | 0.03 | 0.02 | 0.02 | 0.04 | 0.01 | 0.00 | 0.05 | 0.09 | 0.32 | 0.07 | 0.39 | 0.27 | 0.09 | 0.04 | 0.01 | 0.03 | 0.07 | 0.06 | 0.21 | 0.05 | 0.06 | 0.33 | 1.00 | 0.23 | 0.00 | 0.01 | 0.00 | 0.04 | 0.33 | 0.10 | 0.13 | 0.02 | 0.07 | 0.28 | 0.02 | 0.02 | 0.23 | 0.02 | 0.02 | 0.06 |
| RestaurantsDelivery | 0.04 | 0.02 | 0.01 | 0.02 | 0.02 | 0.01 | 0.12 | 0.15 | 0.30 | 0.08 | 0.34 | 0.29 | 0.12 | 0.06 | 0.04 | 0.04 | 0.11 | 0.07 | 0.04 | 0.08 | 0.00 | 0.24 | 0.23 | 1.00 | 0.03 | 0.03 | 0.03 | 0.14 | 0.27 | 0.03 | 0.13 | 0.03 | 0.11 | 0.48 | 0.03 | 0.05 | 0.15 | 0.02 | 0.04 | 0.10 |
| Meat | 0.00 | 0.00 | 0.01 | 0.00 | 0.00 | 0.00 | 0.01 | 0.02 | 0.02 | 0.00 | 0.03 | 0.03 | 0.02 | 0.01 | 0.01 | 0.01 | 0.03 | 0.01 | 0.00 | 0.01 | 0.00 | 0.01 | 0.00 | 0.03 | 1.00 | 0.01 | 0.01 | 0.03 | 0.03 | 0.01 | 0.10 | 0.01 | 0.01 | 0.00 | 0.00 | 0.00 | 0.01 | 0.01 | 0.00 | 0.03 |
| Japanese | 0.02 | 0.02 | 0.00 | 0.01 | 0.02 | 0.03 | 0.04 | 0.09 | 0.06 | 0.04 | 0.13 | 0.04 | 0.10 | 0.10 | 0.01 | 0.07 | 0.08 | 0.05 | 0.04 | 0.08 | 0.05 | 0.02 | 0.01 | 0.03 | 0.01 | 1.00 | 0.23 | 0.08 | 0.09 | 0.00 | 0.00 | 0.04 | 0.06 | 0.04 | 0.03 | 0.00 | 0.03 | 0.08 | 0.02 | 0.20 |
| Asian | 0.01 | 0.01 | 0.10 | 0.01 | 0.02 | 0.02 | 0.02 | 0.06 | 0.04 | 0.03 | 0.09 | 0.04 | 0.05 | 0.20 | 0.03 | 0.05 | 0.06 | 0.04 | 0.01 | 0.05 | 0.04 | 0.02 | 0.00 | 0.03 | 0.01 | 0.23 | 1.00 | 0.07 | 0.04 | 0.01 | 0.00 | 0.03 | 0.05 | 0.03 | 0.01 | 0.03 | 0.04 | 0.19 | 0.01 | 0.06 |
| Other | 0.05 | 0.04 | 0.05 | 0.05 | 0.05 | 0.06 | 0.11 | 0.22 | 0.20 | 0.07 | 0.18 | 0.19 | 0.23 | 0.10 | 0.06 | 0.15 | 0.18 | 0.13 | 0.07 | 0.17 | 0.13 | 0.07 | 0.04 | 0.14 | 0.03 | 0.08 | 0.07 | 1.00 | 0.14 | 0.11 | 0.09 | 0.04 | 0.14 | 0.10 | 0.06 | 0.06 | 0.08 | 0.06 | 0.05 | 0.21 |
| OutdoorSeating | 0.02 | 0.06 | 0.06 | 0.03 | 0.03 | 0.02 | 0.07 | 0.01 | 0.49 | 0.12 | 0.41 | 0.43 | 0.18 | 0.17 | 0.04 | 0.07 | 0.02 | 0.06 | 0.05 | 0.12 | 0.04 | 0.31 | 0.33 | 0.27 | 0.03 | 0.09 | 0.04 | 0.14 | 1.00 | 0.04 | 0.19 | 0.07 | 0.02 | 0.27 | 0.05 | 0.03 | 0.26 | 0.07 | 0.04 | 0.20 |
| Seafood | 0.18 | 0.01 | 0.02 | 0.02 | 0.00 | 0.04 | 0.00 | 0.07 | 0.13 | 0.12 | 0.13 | 0.05 | 0.06 | 0.01 | 0.00 | 0.03 | 0.04 | 0.00 | 0.16 | 0.04 | 0.02 | 0.02 | 0.10 | 0.03 | 0.01 | 0.00 | 0.01 | 0.11 | 0.04 | 1.00 | 0.16 | 0.00 | 0.04 | 0.02 | 0.00 | 0.03 | 0.09 | 0.01 | 0.01 | 0.05 |
| city | 0.27 | 0.00 | 0.07 | 0.00 | 0.06 | 0.13 | 0.10 | 0.13 | 0.21 | 0.95 | 0.13 | 0.11 | 0.11 | 0.03 | 0.00 | 0.05 | 0.07 | 0.14 | 0.03 | 0.11 | 0.14 | 0.40 | 0.13 | 0.13 | 0.10 | 0.00 | 0.00 | 0.09 | 0.19 | 0.16 | 1.00 | 0.00 | 0.14 | 0.07 | 0.06 | 0.01 | 0.16 | 0.00 | 0.11 | 0.11 |
| Buffets | 0.01 | 0.01 | 0.01 | 0.01 | 0.01 | 0.00 | 0.01 | 0.03 | 0.03 | 0.06 | 0.04 | 0.03 | 0.01 | 0.09 | 0.04 | 0.01 | 0.04 | 0.02 | 0.01 | 0.00 | 0.02 | 0.01 | 0.02 | 0.03 | 0.01 | 0.04 | 0.03 | 0.04 | 0.07 | 0.00 | 0.00 | 1.00 | 0.03 | 0.02 | 0.01 | 0.01 | 0.01 | 0.01 | 0.13 | 0.01 |
| Burgers | 0.02 | 0.00 | 0.04 | 0.03 | 0.03 | 0.01 | 0.01 | 0.28 | 0.06 | 0.08 | 0.14 | 0.07 | 0.09 | 0.09 | 0.03 | 0.02 | 0.01 | 0.08 | 0.03 | 0.13 | 0.06 | 0.03 | 0.07 | 0.11 | 0.01 | 0.06 | 0.05 | 0.14 | 0.02 | 0.04 | 0.14 | 0.03 | 1.00 | 0.05 | 0.04 | 0.04 | 0.04 | 0.05 | 0.04 | 0.01 |
| RestaurantsTakeOut | 0.02 | 0.00 | 0.01 | 0.01 | 0.02 | 0.01 | 0.04 | 0.07 | 0.25 | 0.05 | 0.30 | 0.26 | 0.06 | 0.05 | 0.03 | 0.03 | 0.08 | 0.04 | 0.03 | 0.04 | 0.01 | 0.27 | 0.28 | 0.48 | 0.00 | 0.04 | 0.03 | 0.10 | 0.27 | 0.02 | 0.07 | 0.02 | 0.05 | 1.00 | 0.02 | 0.03 | 0.16 | 0.03 | 0.03 | 0.05 |
| Mediterranean | 0.02 | 0.01 | 0.02 | 0.01 | 0.40 | 0.01 | 0.02 | 0.04 | 0.04 | 0.03 | 0.06 | 0.04 | 0.02 | 0.04 | 0.04 | 0.02 | 0.01 | 0.01 | 0.01 | 0.03 | 0.04 | 0.01 | 0.02 | 0.03 | 0.00 | 0.03 | 0.01 | 0.06 | 0.05 | 0.00 | 0.06 | 0.01 | 0.04 | 0.02 | 1.00 | 0.00 | 0.03 | 0.02 | 0.01 | 0.01 |
| Soup | 0.01 | 0.01 | 0.06 | 0.01 | 0.00 | 0.01 | 0.02 | 0.00 | 0.05 | 0.01 | 0.04 | 0.04 | 0.02 | 0.04 | 0.09 | 0.04 | 0.12 | 0.00 | 0.02 | 0.01 | 0.03 | 0.02 | 0.02 | 0.05 | 0.00 | 0.00 | 0.03 | 0.06 | 0.03 | 0.03 | 0.01 | 0.01 | 0.04 | 0.03 | 0.00 | 1.00 | 0.02 | 0.03 | 0.00 | 0.01 |
| Premium_Parking | 0.02 | 0.04 | 0.02 | 0.03 | 0.02 | 0.04 | 0.07 | 0.04 | 0.31 | 0.09 | 0.24 | 0.22 | 0.15 | 0.05 | 0.05 | 0.08 | 0.04 | 0.04 | 0.10 | 0.10 | 0.02 | 0.20 | 0.23 | 0.15 | 0.01 | 0.03 | 0.04 | 0.08 | 0.26 | 0.09 | 0.16 | 0.01 | 0.04 | 0.16 | 0.03 | 0.02 | 1.00 | 0.03 | 0.02 | 0.18 |
| Thai | 0.02 | 0.01 | 0.07 | 0.01 | 0.02 | 0.02 | 0.03 | 0.07 | 0.04 | 0.02 | 0.10 | 0.05 | 0.07 | 0.07 | 0.02 | 0.05 | 0.06 | 0.04 | 0.02 | 0.05 | 0.04 | 0.01 | 0.02 | 0.02 | 0.01 | 0.08 | 0.19 | 0.06 | 0.07 | 0.01 | 0.00 | 0.01 | 0.05 | 0.03 | 0.02 | 0.03 | 0.03 | 1.00 | 0.00 | 0.01 |
| Indian | 0.01 | 0.01 | 0.01 | 0.06 | 0.01 | 0.01 | 0.02 | 0.06 | 0.02 | 0.06 | 0.09 | 0.04 | 0.06 | 0.02 | 0.11 | 0.04 | 0.05 | 0.03 | 0.02 | 0.05 | 0.03 | 0.03 | 0.02 | 0.04 | 0.00 | 0.02 | 0.01 | 0.05 | 0.04 | 0.01 | 0.11 | 0.13 | 0.04 | 0.03 | 0.01 | 0.00 | 0.02 | 0.00 | 1.00 | 0.05 |
| Bars | 0.01 | 0.08 | 0.03 | 0.02 | 0.04 | 0.00 | 0.02 | 0.14 | 0.43 | 0.06 | 0.22 | 0.16 | 0.25 | 0.07 | 0.01 | 0.03 | 0.09 | 0.04 | 0.09 | 0.20 | 0.05 | 0.07 | 0.06 | 0.10 | 0.03 | 0.20 | 0.06 | 0.21 | 0.20 | 0.05 | 0.11 | 0.01 | 0.01 | 0.05 | 0.01 | 0.01 | 0.18 | 0.01 | 0.05 | 1.00 |
# Converting the correlation matrix to a triangular one: mask the upper
# triangle (including the all-1.0 diagonal) so each pair is shown once.
mask = np.zeros_like(cramer_matrix, dtype=bool)
mask[np.triu_indices_from(mask)] = True
# Plotting the final matrix (removed a redundant no-op ax.plot() call):
fig, ax = plt.subplots(figsize=(23, 14))
ax = sns.heatmap(cramer_matrix, mask=mask, annot=True, linewidths=0.9, cmap="BuPu",
                 xticklabels=selected_columns, yticklabels=selected_columns)
As expected, city and state are almost fully correlated. All these restaurants are in America, so it's natural that American food and Traditional food are correlated. Categories such as Fast and Breakfast are correlated, as are Mediterranean and Greek. Regarding attributes, 'OutdoorSeating' and 'Bars' are correlated with 'Alcohol'; likewise, 'RestaurantsTakeOut' is correlated with 'RestaurantsDelivery'.
Through the folium library we have been able to obtain an interactive map grouping the number of restaurants by coordinates. Each group contains the number of restaurants in that particular area and you can interact with it to get the location of each restaurant represented through a marker.
# Base map centred on the continental US
map_restaurants = fl.Map(location=[39, -96], tiles='cartodbpositron', zoom_start=3)
# One clustered marker per restaurant that has valid coordinates
cluster = MarkerCluster()
for _, row in restaurants.iterrows():
    lat, lon = row['latitude'], row['longitude']
    if not (math.isnan(lat) or math.isnan(lon)):
        cluster.add_child(Marker([lat, lon]))
map_restaurants.add_child(cluster)
# Display the map
map_restaurants